from stanfordcorenlp import StanfordCoreNLP
import re
import pickle
import jieba

# --- DISABLED: English tokenization pass (already run; output en_split.pickle).
# Kept for reference. Was wrapped in a module-level triple-quoted string,
# which still allocated (and discarded) a large str at import time; plain
# comments avoid that. Re-enable by uncommenting and restoring the CoreNLP path.
#
# nlp = StanfordCoreNLP('E:/stanford-corenlp-full-2016-10-31', lang='en')
#
# with open('train_en_clean', 'r', encoding='utf-8') as f:
#     data_en = f.readlines()
#
# f = open("en_split.pickle", "wb")
#
# data_en_split = []
# for line_num in range(len(data_en)):
#     temp = nlp.word_tokenize(data_en[line_num])
#     en_split = []
#     for i in temp:
#         # Strip literal "\t"/"\n" escape sequences from each token.
#         i = re.sub(r'\\t|\\n', '', i)
#         if len(i) > 0:
#             en_split.append(i)
#
#     data_en_split.append(en_split)
#
# pickle.dump(data_en_split, f)

# --- Tokenize the cleaned Chinese corpus and pickle the token lists. ---
# Input:  'train_zh_clean'  — one sentence per line, UTF-8.
# Output: 'zh_split.pickle' — list[list[str]], one token list per input line.
with open('train_zh_clean', 'r', encoding='utf-8') as f:
    data_zh = f.readlines()

data_zh_split = []
for line in data_zh:
    # jieba.lcut returns the segmented tokens of the line. readlines()
    # keeps the trailing newline, so remove ALL whitespace inside each
    # token and drop tokens that were pure whitespace.
    # NOTE(review): the original comment said "filter out punctuation",
    # but r'\s+' only removes whitespace — punctuation tokens survive.
    tokens = (re.sub(r'\s+', '', tok) for tok in jieba.lcut(line))
    data_zh_split.append([tok for tok in tokens if tok])

# 'with' guarantees the pickle is flushed and the handle closed; the
# original opened the file and never closed it (possible truncated output).
with open("zh_split.pickle", "wb") as f:
    pickle.dump(data_zh_split, f)